Create chips for the majority of GB

This notebook samples from the majority of GB and dumps them to disk as TIF files to be loaded by TensorFlow.

import glob
import os
import requests
import shutil
import warnings
from random import shuffle

from itertools import product, chain
from pathlib import Path

import contextily
import dask
import dask_geopandas
import dask.bag
import dask.dataframe
import geopandas
import numpy
import pandas
import pygeos
import pyogrio
import xarray, rioxarray
import libpysal

from dask_geopandas.hilbert_distance import _hilbert_distance
from dask.distributed import Client, LocalCluster
from shapely.geometry import box
from shapely.ops import polygonize
from tqdm.auto import tqdm
from sqlalchemy import create_engine

import tools
# Spin up a local Dask cluster: 16 workers with one thread each
# (single-threaded workers presumably chosen to avoid thread contention
# during raster reads -- TODO confirm).
client = Client(
    LocalCluster(n_workers=16, threads_per_worker=1)
)
client

Client

Client-f5b49697-8fdf-11ec-84d2-dfb21f7d7f99

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

Specs

# Pipeline parameters: chip geometry, input rasters/vectors, and output folders.
specs = {
    'chip_size': 32,  # chip edge length in pixels
    'bands': [1, 2, 3], #RGB
    # Sentinel-2 GHS composite mosaic (VRT).
    'mosaic_p': (
        '/home/jovyan/work/urbangrammar_samba/'
        'ghs_composite_s2/GHS-composite-S2.vrt'
    ),
    # Spatial signatures polygons (GeoPackage).
    'spsig_p': (
        '/home/jovyan/work/urbangrammar_samba/spatial_signatures/'
        'signatures/'
        'signatures_combined_levels_simplified_clip.gpkg'
    ),
    
    # Scratch folder for the intermediate point grid.
    'points_temp': '/home/jovyan/work/chips_gb/temp/points/',
    # Final destination for the generated chips.
    'folder': (
        '/home/jovyan/work/chips_gb/32_shuffled/'
    ),
}

Load signatures and Sentinel2 mosaic

%%time
# Load the spatial signatures polygons into a GeoDataFrame.
spsig = pyogrio.read_dataframe(specs['spsig_p'])
CPU times: user 848 ms, sys: 355 ms, total: 1.2 s
Wall time: 3.48 s
# Lazily open the Sentinel-2 composite mosaic as a dask-chunked DataArray
# (1024x1024 chunks keep individual reads small).
mosaic = rioxarray.open_rasterio(
    specs['mosaic_p'], chunks={'x': 1024, 'y': 1024}
)

Get range of coordinates from the mosaic

# Bounding coordinates of the mosaic in its native CRS.
x_vals = mosaic["x"]
y_vals = mosaic["y"]
start_x, end_x = float(x_vals.min()), float(x_vals.max())
start_y, end_y = float(y_vals.min()), float(y_vals.max())

Create coordinates of individual points spaced at the chip size. We use only the southern half of the extent, which covers the country up to Perth.

# Spacing between chip centres in CRS units: chip_size pixels times 10
# (presumably 10 m Sentinel-2 resolution -- TODO confirm).
step = specs["chip_size"] * 10
xs = numpy.arange(start_x, end_x, step)
ys = numpy.arange(start_y, end_y, step)
# Keep only the southern half of the y range (covers GB up to Perth).
southern_ys = ys[: ys.shape[0] // 2]
x_bag = dask.bag.from_sequence(xs)
y_bag = dask.bag.from_sequence(southern_ys)
product_bag = x_bag.product(y_bag)
ddf = product_bag.to_dataframe(meta={"x": float, "y": float})
%%time
# Materialise the coordinate grid to parquet so later stages can re-read it in partitions.
ddf.to_parquet("/home/jovyan/work/chips_gb/temp/coords/", overwrite=True)
CPU times: user 14.7 s, sys: 836 ms, total: 15.5 s
Wall time: 17.2 s
[None]

Create points from coordinates

# Rebuild the coordinate grid as a GeoDataFrame of points in EPSG:27700
# (British National Grid) and persist it for the spatial join stage.
ddf = dask.dataframe.read_parquet("/home/jovyan/work/chips_gb/temp/coords/")
ddf["geometry"] = dask_geopandas.points_from_xy(ddf, "x", "y", crs=27700)
gddf = dask_geopandas.from_dask_dataframe(ddf).set_crs(27700)
gddf.to_parquet(specs["points_temp"], overwrite=True)
# Restart workers between stages -- presumably to release accumulated memory (TODO confirm).
client.restart()

Client

Client-a8ea1a6b-8fdd-11ec-80c7-ad6fa5f97e07

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

Filter points that fall on land.

  • Clip by the coastline

  • Spatial join

# Index signatures by their Hilbert-curve distance so partitions are spatially coherent.
signatures = spsig[['signature_type', 'geometry']].set_index(_hilbert_distance(spsig, spsig.total_bounds, level=10))
points = dask_geopandas.read_parquet(specs["points_temp"])
# Keep only points falling inside a signature polygon (i.e. on land).
# `op=` is deprecated in (dask-)geopandas sjoin; use `predicate=` instead.
points_within = dask_geopandas.sjoin(points, signatures, how="inner", predicate='within')
/tmp/ipykernel_199/3757199691.py:1: FutureWarning: The `op` parameter is deprecated and will be removed in a future release. Please use the `predicate` parameter instead.
  points_within = dask_geopandas.sjoin(points, signatures, how="inner", op='within')
# Persist the land-filtered points, then restart workers before the next stage.
points_within.to_parquet("/home/jovyan/work/chips_gb/temp/points_within/", overwrite=True)
client.restart()

Client

Client-a8ea1a6b-8fdd-11ec-80c7-ad6fa5f97e07

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

Create chip bounds polygons.

# Build square chip footprints centred on each retained point.
points_within = dask.dataframe.read_parquet("/home/jovyan/work/chips_gb/temp/points_within/", columns=["geometry"])
# Geometry round-trips through plain dask.dataframe parquet as WKB; decode back to geometries.
points_within["geometry"] = points_within["geometry"].map_partitions(geopandas.GeoSeries.from_wkb, meta=geopandas.GeoSeries())
points_within = dask_geopandas.from_dask_dataframe(points_within)
# Square buffer (cap_style=3) of half the chip's ground extent yields a
# chip_size x chip_size box around each point (assumes 10 m pixels -- TODO confirm).
polygons = points_within.buffer(specs['chip_size'] * 10 / 2, cap_style=3)
points_within["geometry"] = polygons
# Small partitions keep downstream per-partition work cheap.
points_within = points_within.repartition(partition_size="1MB")
points_within.to_parquet("/home/jovyan/work/chips_gb/temp/polygons/", overwrite=True)

Dump signatures to sorted partitioned parquet.

# Sort signatures along the Hilbert curve and dump them as a partitioned parquet dataset.
signatures = spsig[['signature_type', 'geometry']].set_index(_hilbert_distance(spsig, spsig.total_bounds, level=10))
signatures = dask_geopandas.from_geopandas(signatures.sort_index(), npartitions=1000)
signatures.to_parquet("/home/jovyan/work/chips_gb/temp/signatures")
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
/tmp/ipykernel_199/798855391.py in <module>
      1 signatures = dask_geopandas.from_geopandas(signatures.sort_index(), npartitions=1000)
----> 2 signatures.to_parquet("/home/jovyan/work/chips_gb/temp/signatures")

/opt/conda/lib/python3.9/site-packages/dask_geopandas/core.py in to_parquet(self, path, *args, **kwargs)
    585         from .io.parquet import to_parquet
    586 
--> 587         return to_parquet(self, path, *args, **kwargs)
    588 
    589     def to_feather(self, path, *args, **kwargs):

/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py in to_parquet(df, path, engine, compression, write_index, append, overwrite, ignore_divisions, partition_on, storage_options, custom_metadata, write_metadata_file, compute, compute_kwargs, schema, name_function, **kwargs)
    841     graph = HighLevelGraph.from_collections(meta_name, dsk, dependencies=(data_write,))
    842     if compute:
--> 843         return compute_as_if_collection(
    844             Scalar, graph, [(meta_name, 0)], **compute_kwargs
    845         )

/opt/conda/lib/python3.9/site-packages/dask/base.py in compute_as_if_collection(cls, dsk, keys, scheduler, get, **kwargs)
    315     schedule = get_scheduler(scheduler=scheduler, cls=cls, get=get)
    316     dsk2 = optimization_function(cls)(dsk, keys, **kwargs)
--> 317     return schedule(dsk2, keys, **kwargs)
    318 
    319 

/opt/conda/lib/python3.9/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
   2992                     should_rejoin = False
   2993             try:
-> 2994                 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
   2995             finally:
   2996                 for f in futures.values():

/opt/conda/lib/python3.9/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
   2144             else:
   2145                 local_worker = None
-> 2146             return self.sync(
   2147                 self._gather,
   2148                 futures,

/opt/conda/lib/python3.9/site-packages/distributed/utils.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
    307             return future
    308         else:
--> 309             return sync(
    310                 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
    311             )

/opt/conda/lib/python3.9/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
    370     else:
    371         while not e.is_set():
--> 372             wait(10)
    373 
    374     if error:

/opt/conda/lib/python3.9/site-packages/distributed/utils.py in wait(timeout)
    359     def wait(timeout):
    360         try:
--> 361             return e.wait(timeout)
    362         except KeyboardInterrupt:
    363             loop.add_callback(cancel)

/opt/conda/lib/python3.9/threading.py in wait(self, timeout)
    572             signaled = self._flag
    573             if not signaled:
--> 574                 signaled = self._cond.wait(timeout)
    575             return signaled
    576 

/opt/conda/lib/python3.9/threading.py in wait(self, timeout)
    314             else:
    315                 if timeout > 0:
--> 316                     gotit = waiter.acquire(True, timeout)
    317                 else:
    318                     gotit = waiter.acquire(False)

KeyboardInterrupt: 
# Restart the cluster after the interrupted write above.
client.restart()

Client

Client-7babfc26-7df9-11ec-a48a-f5d3c2b9d425

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

Filter polygons fully within a single signature type.

# Keep only chip polygons that lie entirely within a single signature polygon,
# attaching that polygon's signature_type as the label.
polygons = dask_geopandas.read_parquet("/home/jovyan/work/chips_gb/temp/polygons/").set_crs(27700)
# `op=` is deprecated in (dask-)geopandas sjoin; use `predicate=` instead.
polygons_within = dask_geopandas.sjoin(polygons, spsig[['signature_type', 'geometry']], predicate="within")
/tmp/ipykernel_1234/3882422153.py:1: FutureWarning: The `op` parameter is deprecated and will be removed in a future release. Please use the `predicate` parameter instead.
  polygons_within = dask_geopandas.sjoin(polygons, spsig[['signature_type', 'geometry']], op="within")
# Persist the labelled chip bounds for the sampling stage.
polygons_within.to_parquet("/home/jovyan/work/chips_gb/temp/chip_bounds/", overwrite=True)
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
/opt/conda/lib/python3.9/site-packages/dask/dataframe/io/parquet/core.py:157: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  return self.engine.write_partition(
[None]
# Restart workers before computing the per-type counts.
client.restart()

Client

Client-f5b49697-8fdf-11ec-84d2-dfb21f7d7f99

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

How many chips do we have per type?

# Count how many candidate chips each signature type has.
polygons_within = dask_geopandas.read_parquet(
    "/home/jovyan/work/chips_gb/temp/chip_bounds/"
)
counts = polygons_within["signature_type"].value_counts()
val_counts = counts.compute()
val_counts
0_0    766870
4_0    408232
7_0    206819
3_0     16205
5_0     10031
1_0      7241
2_0      1757
2_2      1716
2_1      1103
8_0      1080
9_0       812
6_0       736
9_2       272
9_4        46
9_1        30
9_5         5
9_6         1
Name: signature_type, dtype: int64

If you want to do sliding based augmentation, skip the following steps to the Augmentation part.

Get sample of max 50k chips per type.

# Draw at most 50k chip bounds per signature type and write one parquet file per type.
chip_bds = dask_geopandas.read_parquet("/home/jovyan/work/chips_gb/temp/chip_bounds/", columns=["geometry", "signature_type"])
# NOTE(review): type '9_6' (a single chip per the counts above) is not listed -- confirm this exclusion is intentional.
types = ['0_0', '1_0', '3_0', '4_0', '5_0', '6_0', '7_0', '8_0', '2_0', '2_1', '2_2', '9_0', '9_1', '9_2', '9_4', '9_5']
for t in types:
    # Materialise only this type's rows; per-type frames are small enough for memory.
    df = chip_bds[chip_bds["signature_type"] == t].compute()
    # Cap the sample at 50k; fixed random_state keeps the draw reproducible.
    n = min(len(df), 50_000)
    df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
    print(t)
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
0_0
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
1_0
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
3_0
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
4_0
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
5_0
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
6_0
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
7_0
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
8_0
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
2_0
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
2_1
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
2_2
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
9_0
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
9_1
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
9_2
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
9_4
9_5
/tmp/ipykernel_65930/477179734.py:4: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  df.sample(n=n, random_state=42).to_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
# Restart the Dask client to release worker memory between heavy steps.
client.restart()

Client

Client-a865dce7-84e9-11ec-818a-634439067d6f

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

Create TIF chips sampled from the mosaic.

# Sampling configuration: 32 px RGB chips cut from the Sentinel-2 mosaic
# and written below a single `all/` folder prior to splitting.
specs = dict(
    chip_size=32,
    bands=[1, 2, 3],  # RGB
    mosaic_p=(
        '/home/jovyan/work/urbangrammar_samba/'
        'ghs_composite_s2/GHS-composite-S2.vrt'
    ),
    folder='/home/jovyan/work/chips_gb/32/all/',
)
# For every signature type: load its chip bounds, attach integer centroid
# coordinates (used as the sampling anchor), and dump TIF chips to disk.
for t in types:
    chips = geopandas.read_parquet(f"/home/jovyan/work/chips_gb/temp/chip_bounds_per_type/{t}.parquet")
    chip_centroids = chips.centroid
    chips["X"] = chip_centroids.x.astype(int)
    chips["Y"] = chip_centroids.y.astype(int)
    tools.spilled_bag_of_chips(chips, specs, npartitions=16)
    print(t)
0_0
1_0
3_0
4_0
5_0
6_0
7_0
8_0
2_0
2_1
2_2
9_0
9_1
9_2
9_4
9_5

Split into train, validation and secret.

# Fractions of chips per type routed to train / validation / secret.
split = (.7, .15, .15)


def _copy_subset(files, subset):
    """Copy chip TIFs from `<root>/all/<type>/` into `<root>/<subset>/<type>/`."""
    for f in tqdm(files):
        f = Path(f)
        shutil.copy(f, str(f.parent.parent.parent) + "/" + subset + "/" + f.parent.stem + "/" + f.name)


# NOTE(review): glob order is filesystem-dependent and not shuffled here,
# so this split is not randomised — confirm that is acceptable.
for t in types:
    # make sure the destination folders exist for all three subsets
    # (specs['folder'][:-4] strips the trailing "all/" component)
    for subset in ["train", "validation", "secret"]:
        os.makedirs(f"{specs['folder'][:-4]}{subset}/{Path(t).stem}", exist_ok=True)
    files = glob.glob(specs["folder"] + t + "/*.tif")
    count = len(files)
    cut_a = int(count * split[0])
    cut_b = int(count * (split[0] + split[1]))
    _copy_subset(files[:cut_a], "train")
    _copy_subset(files[cut_a:cut_b], "validation")
    _copy_subset(files[cut_b:], "secret")
    print(t, "done")
0_0 done
1_0 done
3_0 done
4_0 done
5_0 done
6_0 done
7_0 done
8_0 done
2_0 done
2_1 done
2_2 done
9_0 done
9_1 done
9_2 done
9_4 done
9_5 done
# Restart the Dask client to release worker memory after the copy step.
client.restart()
Exception in thread AsyncProcess Dask Worker process (from Nanny) watch process join:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/threading.py", line 973, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.9/threading.py", line 910, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.9/site-packages/distributed/process.py", line 234, in _watch_process
    assert exitcode is not None
AssertionError
distributed.nanny - WARNING - Worker process still alive after 1 seconds, killing

Client

Client-b6623fc4-7e03-11ec-af24-f78fed6e375f

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

# Chips of the underrepresented types (everything except 0_0, 4_0, 7_0),
# materialised for interactive inspection.
bds = polygons_within[~polygons_within["signature_type"].isin(["0_0", "4_0", "7_0"])].compute()
import xyzservices

# Interactive map of the chip bounds coloured by signature type, over the
# EOX Sentinel-2 cloudless basemap (resolved through the QMS registry).
bds.explore("signature_type",cmap="tab20", prefer_canvas=True,
            tiles=xyzservices.TileProvider.from_qms("EOX::Maps - Sentinel-2 cloudless")(max_zoom=22),
            max_zoom=22)
Make this Notebook Trusted to load map: File -> Trust Notebook

Augmentation

  1. Filter underrepresented signature types

  2. Build rook weights

  3. Get components

  4. Filter components

    • components have one feature - assign to one subset and skip shuffling

    • components have less than 20 features - assign to one subset and include in shuffling

    • components have 20 or more features - split and shuffle

      • compute hilbert curve

      • sort per hilbert distance

      • split into subsets along hilbert

  5. shuffle

    • get GeoDataFrames containing subsets

    • Union components within each gdf and store as bounds of shuffling

    • repeatedly translate chips

    • sjoin translated chips within bounds

  6. combine with singletons

  7. divide overrepresented signature types into subsets and combine with the rest

  8. sample Sentinel and dump TIFs to disk

  1. Filter underrepresented signature types

# Fresh Dask client state before the augmentation pipeline.
client.restart()

Client

Client-9cff7ab2-8430-11ec-8220-43e83ca9b87c

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

# Reload every chip's bounds and keep only the underrepresented signature
# types; these go through the shuffle/augmentation pipeline below.
polygons_within = dask_geopandas.read_parquet("/home/jovyan/work/chips_gb/temp/chip_bounds/")
chips_to_shuffle = polygons_within[~polygons_within["signature_type"].isin(["0_0", "4_0", "7_0"])].compute()
  1. Build rook weights

%%time
# Rook contiguity: chips sharing an edge are neighbours, so connected
# components of this graph correspond to contiguous patches of chips.
W = libpysal.weights.Rook.from_dataframe(chips_to_shuffle, silence_warnings=True)
CPU times: user 3.33 s, sys: 593 ms, total: 3.92 s
Wall time: 3.47 s
  1. Get components

# Group chips by the connected component of the rook graph they belong to.
components = chips_to_shuffle.groupby(W.component_labels)
  1. Filter components

# Bucket connected components by size: singletons are assigned to a
# subset directly, small ones (< 20 chips) are assigned whole, and the
# rest will be split along a Hilbert curve and shuffled.
singleton_components = {}
small_components = {}
shuffle_components = {}
for label, idx in tqdm(components.indices.items()):
    size = len(idx)
    if size == 1:
        target = singleton_components
    elif size < 20:
        target = small_components
    else:
        target = shuffle_components
    target[label] = idx

Collect final dataframes in lists.

# Accumulators for the final per-subset frames and the global
# train / validation / secret split fractions.
train = []
validation = []
secret = []
split = (.7, .15, .15)

components have one feature - assign to one subset and skip shuffling

# Singleton components: pool the lone chips, shuffle them with a fixed
# seed, and slice the pool into the three subsets by the split fractions.
singleton_indices = numpy.array(list(singleton_components.values())).flatten()
singleton_df = chips_to_shuffle.iloc[singleton_indices].sample(frac=1, random_state=42)
total = singleton_df.shape[0]
cut_a = int(total * split[0])
cut_b = int(total * (split[0] + split[1]))
train.append(singleton_df.iloc[:cut_a])
validation.append(singleton_df.iloc[cut_a:cut_b])
secret.append(singleton_df.iloc[cut_b:])

components have less than 20 features - assign to one subset and include in shuffling

# Small components (< 20 chips): shuffle the component order in place and
# assign whole components to subsets so neighbours never straddle a split.
train_to_shuffle = []
validation_to_shuffle = []
secret_to_shuffle = []
small_indices = list(small_components.values())
shuffle(small_indices)
total = len(small_indices)
cut_a = int(total * split[0])
cut_b = int(total * (split[0] + split[1]))
for bucket, chunk in (
    (train_to_shuffle, small_indices[:cut_a]),
    (validation_to_shuffle, small_indices[cut_a:cut_b]),
    (secret_to_shuffle, small_indices[cut_b:]),
):
    bucket.append(chips_to_shuffle.iloc[numpy.concatenate(chunk)])

components have 20 or more features - split and shuffle

# Large components (20+ chips): order chips along a Hilbert curve so each
# subset receives a spatially contiguous run, then cut by the split
# fractions.
for comp_idx in tqdm(shuffle_components.values()):
    comp_df = chips_to_shuffle.iloc[comp_idx]

    # rank chips by their Hilbert-curve distance within the component bounds
    order = _hilbert_distance(comp_df, comp_df.total_bounds, level=8)
    ordered = comp_df.set_index(order).sort_index().reset_index(drop=True)

    # consecutive runs along the curve become train / validation / secret
    total = ordered.shape[0]
    cut_a = int(total * split[0])
    cut_b = int(total * (split[0] + split[1]))
    train_to_shuffle.append(ordered.iloc[:cut_a])
    validation_to_shuffle.append(ordered.iloc[cut_a:cut_b])
    secret_to_shuffle.append(ordered.iloc[cut_b:])

shuffle

  • get GeoDataFrames containing subsets

# Concatenate the per-component pieces into one frame per subset.
train_df_to_shuffle = pandas.concat(train_to_shuffle).reset_index(drop=True)
validation_df_to_shuffle = pandas.concat(validation_to_shuffle).reset_index(drop=True)
secret_df_to_shuffle = pandas.concat(secret_to_shuffle).reset_index(drop=True)

Union components within each gdf and store as bounds of shuffling

# Rook weights within each subset; their connected components delimit
# contiguous areas that belong entirely to a single subset.
w_train = libpysal.weights.Rook.from_dataframe(train_df_to_shuffle, silence_warnings=True)
w_validation = libpysal.weights.Rook.from_dataframe(validation_df_to_shuffle, silence_warnings=True)
w_secret = libpysal.weights.Rook.from_dataframe(secret_df_to_shuffle, silence_warnings=True)
%%time
# Dissolve each subset's connected components into single polygons that
# bound the chip shuffling below.
train_bounds = train_df_to_shuffle.dissolve(w_train.component_labels)
validation_bounds = validation_df_to_shuffle.dissolve(w_validation.component_labels)
secret_bounds = secret_df_to_shuffle.dissolve(w_secret.component_labels)
CPU times: user 4.47 s, sys: 214 ms, total: 4.69 s
Wall time: 4.19 s

repeatedly translate chips

# Slide chips in 60 m steps within one chip footprint
# (chip_size px at 10 m/px -> 320 m for 32 px chips).
shuffle_by = 60
chip_size = specs["chip_size"] * 10

train_shuffled = []
validation_shuffled = []
secret_shuffled = []

# NOTE(review): the x offsets start at 0 but the y offsets start at
# shuffle_by, so combinations with y == 0 are never produced — confirm
# this asymmetry is intentional.
for x, y in tqdm(product(range(0, chip_size, shuffle_by), range(shuffle_by, chip_size, shuffle_by))):
    
    # translate the geometries while keeping all chip attributes
    shuffled = train_df_to_shuffle.translate(xoff=x, yoff=y)
    train_shuffled.append(train_df_to_shuffle.set_geometry(shuffled))
    
    shuffled = validation_df_to_shuffle.translate(xoff=x, yoff=y)
    validation_shuffled.append(validation_df_to_shuffle.set_geometry(shuffled))
    
    shuffled = secret_df_to_shuffle.translate(xoff=x, yoff=y)
    secret_shuffled.append(secret_df_to_shuffle.set_geometry(shuffled))

sjoin translated chips within bounds

# Stack all translated copies into one frame per subset.
train_shuffled_df = pandas.concat(train_shuffled)
validation_shuffled_df = pandas.concat(validation_shuffled)
secret_shuffled_df = pandas.concat(secret_shuffled)
%%time
# Keep only translated chips that still fall fully within their own
# subset's dissolved bounds, so shuffled chips never leak across subsets.
# `index_right` is dropped first to avoid a column clash in sjoin
# (presumably left over from an earlier join — verify against the data).
train_shuffled_within = train_shuffled_df.drop(columns="index_right").sjoin(train_bounds[["geometry"]], predicate="within")
validation_shuffled_within = validation_shuffled_df.drop(columns="index_right").sjoin(validation_bounds[["geometry"]], predicate="within")
secret_shuffled_within = secret_shuffled_df.drop(columns="index_right").sjoin(secret_bounds[["geometry"]], predicate="within")
CPU times: user 4.43 s, sys: 175 ms, total: 4.61 s
Wall time: 3.87 s

combine with singletons

# Final chip sets: originals + translated copies that stayed in bounds,
# plus the singleton components set aside earlier.
train_chips = pandas.concat([train_df_to_shuffle, train_shuffled_within] + train).reset_index(drop=True)
validation_chips = pandas.concat([validation_df_to_shuffle, validation_shuffled_within] + validation).reset_index(drop=True)
secret_chips = pandas.concat([secret_df_to_shuffle, secret_shuffled_within] + secret).reset_index(drop=True)

keep only <50k per type

# Cap the training set per signature type.
# NOTE(review): the trigger is > 50k but the cut keeps only the first
# 35000 rows — confirm 35000 is intentional and not a stale constant.
to_drop = []
for t in train_chips.signature_type.unique():
    mask = train_chips.signature_type == t
    if mask.sum() > 50_000:
        # seeded shuffle for reproducibility, consistent with the
        # random_state=42 used for sampling throughout this notebook
        to_drop.append(list(train_chips[mask].sample(frac=1, random_state=42).iloc[35000:].index))
drop_train = list(chain(*to_drop))
train_chips_limited = train_chips.drop(drop_train, axis=0)
# Cap the validation set at 7500 chips per signature type.
to_drop = []
for t in validation_chips.signature_type.unique():
    mask = validation_chips.signature_type == t
    if mask.sum() > 7500:
        # seeded shuffle for reproducibility (was unseeded, unlike the
        # rest of the notebook which uses random_state=42)
        to_drop.append(list(validation_chips[mask].sample(frac=1, random_state=42).iloc[7500:].index))
drop_validation = list(chain(*to_drop))
validation_chips_limited = validation_chips.drop(drop_validation, axis=0)

# Cap the secret set at 7500 chips per signature type.
to_drop = []
for t in secret_chips.signature_type.unique():
    mask = secret_chips.signature_type == t
    if mask.sum() > 7500:
        # seeded shuffle for reproducibility (was unseeded, unlike the
        # rest of the notebook which uses random_state=42)
        to_drop.append(list(secret_chips[mask].sample(frac=1, random_state=42).iloc[7500:].index))
drop_secret = list(chain(*to_drop))
secret_chips_limited = secret_chips.drop(drop_secret, axis=0)

divide overrepresented signature types into subsets and combine with the rest

# Overrepresented types are handled separately: reload all chip bounds,
# keep 0_0 / 4_0 / 7_0 and randomise their order with a fixed seed.
polygons_within = dask_geopandas.read_parquet("/home/jovyan/work/chips_gb/temp/chip_bounds/")
overrepresented_chips = polygons_within[polygons_within["signature_type"].isin(["0_0", "4_0", "7_0"])].compute()
randomized = overrepresented_chips.sample(frac=1, random_state=42)

# Take at most 50k randomised chips per overrepresented type and split
# them into the three subsets by the global fractions.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0 —
# use pandas.concat instead.
for t in ["0_0", "4_0", "7_0"]:
    subset = randomized[randomized.signature_type == t].iloc[:50_000]
    total = subset.shape[0]
    cut_a = int(total * split[0])
    cut_b = int(total * (split[0] + split[1]))
    train_chips_limited = pandas.concat(
        [train_chips_limited, subset.iloc[:cut_a]]
    )
    validation_chips_limited = pandas.concat(
        [validation_chips_limited, subset.iloc[cut_a:cut_b]]
    )
    secret_chips_limited = pandas.concat(
        [secret_chips_limited, subset.iloc[cut_b:]]
    )
# Persist the three splits as GeoParquet for later chip generation.
train_chips_limited.to_parquet("/home/jovyan/work/chips_gb/slided_train_50k.pq")
validation_chips_limited.to_parquet("/home/jovyan/work/chips_gb/slided_validation_50k.pq")
secret_chips_limited.to_parquet("/home/jovyan/work/chips_gb/slided_secret_50k.pq")
/tmp/ipykernel_1234/783543889.py:1: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  train_chips_limited.to_parquet("/home/jovyan/work/chips_gb/slided_train_50k.pq")
/tmp/ipykernel_1234/783543889.py:2: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  validation_chips_limited.to_parquet("/home/jovyan/work/chips_gb/slided_validation_50k.pq")
/tmp/ipykernel_1234/783543889.py:3: UserWarning: this is an initial implementation of Parquet/Feather file support and associated metadata.  This is tracking version 0.1.0 of the metadata specification at https://github.com/geopandas/geo-arrow-spec

This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

To further ignore this warning, you can do: 
import warnings; warnings.filterwarnings('ignore', message='.*initial implementation of Parquet.*')
  secret_chips_limited.to_parquet("/home/jovyan/work/chips_gb/slided_secret_50k.pq")

sample Sentinel and dump TIFs to disk

# Dump the training chips.
# NOTE(review): the path says 64_shuffled while specs['chip_size'] is 32
# — confirm the folder name matches the chip size actually written.
specs["folder"] = '/home/jovyan/work/chips_gb/64_shuffled/train/'

# Integer centroid coordinates anchor the sampling window for each chip.
centroid = train_chips_limited.centroid
train_chips_limited['X'] = centroid.x.astype(int)
train_chips_limited['Y'] = centroid.y.astype(int)
tools.spilled_bag_of_chips(train_chips_limited, specs, npartitions=14)
# Restart the Dask client to release worker memory before the next dump.
client.restart()
distributed.worker - WARNING - Unmanaged memory use is high. This may indicate a memory leak or the memory may not be released to the OS; see https://distributed.dask.org/en/latest/worker.html#memtrim for more information. -- Unmanaged memory: 6.34 GiB -- Worker memory limit: 7.85 GiB

Client

Client-5677aca9-88d2-11ec-a8c7-39c90bf7da5f

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

# Dump the validation chips to their own folder.
specs["folder"] = '/home/jovyan/work/chips_gb/64_shuffled/validation/'

# Integer centroid coordinates anchor the sampling window for each chip.
centroid = validation_chips_limited.centroid
validation_chips_limited['X'] = centroid.x.astype(int)
validation_chips_limited['Y'] = centroid.y.astype(int)
tools.spilled_bag_of_chips(validation_chips_limited, specs, npartitions=14)
# Restart the Dask client to release worker memory before the next dump.
client.restart()

Client

Client-5677aca9-88d2-11ec-a8c7-39c90bf7da5f

Connection method: Cluster object Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status

Cluster Info

# Dump the secret (held-out) chips to their own folder.
specs["folder"] = '/home/jovyan/work/chips_gb/64_shuffled/secret/'

# Integer centroid coordinates anchor the sampling window for each chip.
centroid = secret_chips_limited.centroid
secret_chips_limited['X'] = centroid.x.astype(int)
secret_chips_limited['Y'] = centroid.y.astype(int)
tools.spilled_bag_of_chips(secret_chips_limited, specs, npartitions=14)

TODO: check if they’re alright

import xyzservices

# Visual sanity check: map the first 10k shuffled training chips over the
# EOX Sentinel-2 cloudless basemap.
train_chips_limited[~train_chips_limited["signature_type"].isin(["0_0", "4_0", "7_0"])].iloc[:10000].explore("signature_type", style_kwds=dict(fill=False),cmap="tab20", prefer_canvas=True,
            tiles=xyzservices.TileProvider.from_qms("EOX::Maps - Sentinel-2 cloudless")(max_zoom=22))
# All processing done; shut the cluster down.
client.shutdown()
# Display the shuffled (non-overrepresented) training chips for
# inspection. The original cell ended with a stray "." which is a
# SyntaxError; the dangling attribute access is removed.
train_chips_limited[~train_chips_limited["signature_type"].isin(["0_0", "4_0", "7_0"])]
geometry index_right signature_type X Y
0 POLYGON ((489181.267 182910.753, 489181.267 18... 69214 3_0 488861 182590
1 POLYGON ((489181.267 183550.753, 489181.267 18... 69214 3_0 488861 183230
2 POLYGON ((489821.267 182270.753, 489821.267 18... 69214 3_0 489501 181950
3 POLYGON ((489821.267 182910.753, 489821.267 18... 69214 3_0 489501 182590
4 POLYGON ((426461.267 420990.753, 426461.267 42... 39881 3_0 426141 420670
... ... ... ... ... ...
188131 POLYGON ((379101.267 405630.753, 379101.267 40... 24882 3_0 378781 405310
188132 POLYGON ((519901.267 186110.753, 519901.267 18... 68512 2_2 519581 185790
188134 POLYGON ((252381.267 55550.753, 252381.267 549... 641 3_0 252061 55230
188135 POLYGON ((526941.267 106750.753, 526941.267 10... 49528 1_0 526621 106430
188136 POLYGON ((265821.267 663550.753, 265821.267 66... 35420 1_0 265501 663230

127604 rows × 5 columns